knitr::opts_chunk$set(echo = TRUE)
# libraries
library(tidyverse)
library(dplyr)
library(readxl)
library(ggplot2)
library(wesanderson)
library(RColorBrewer)
library(lattice)
library(car)
library(MASS)
library(broom)

# set directory
setwd("~/Desktop/Cerner Data")

# import data
asp_raw <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/asp_cohort_w_denominators_first_enc_year_uncollapsed.csv")

# weighted data
weights <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/For_Brittany_adults_weighted_cerner_data_12Mar25.csv")
forweights <- read_csv("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/forweights.csv")

Data Wrangling and Exploration

glimpse(asp_raw)
## Rows: 258,882
## Columns: 12
## $ ...1          <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ year          <dbl> 2023, 2022, 2016, 2018, 2023, 2017, 2014, 2017, 2022, 20…
## $ prefstate     <chr> "OR", "NM", "HI", "ND", "NJ", "TX", "PA", "KY", "ME", "I…
## $ prefurban     <chr> "rural", "urban", "urban", "urban", "urban", "urban", "u…
## $ prefrace      <chr> "White", "White", "Native Hawaiian or Other Pacific Isla…
## $ prefethnicity <chr> "Non-Hispanic", "Non-Hispanic", "Hispanic or Latino", "N…
## $ prefgender    <chr> "Female", "Male", "Male", "Female", "Female", "Female", …
## $ age_group     <chr> "65 and Over", "35 to 64", "35 to 64", "35 to 64", "18 t…
## $ n_rwdpts      <dbl> 11011, 3183, 31, 37, 5694, 3611, 2316, 149, 70, 310, 311…
## $ n_rwdenc      <dbl> 83713, 17773, 237, 146, 34244, 15949, 7404, 212, 369, 10…
## $ n_fungal      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ n_fungalenc   <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
summary(asp_raw)
##       ...1             year       prefstate          prefurban        
##  Min.   :     0   Min.   :2013   Length:258882      Length:258882     
##  1st Qu.: 64720   1st Qu.:2016   Class :character   Class :character  
##  Median :129440   Median :2018   Mode  :character   Mode  :character  
##  Mean   :129440   Mean   :2018                                        
##  3rd Qu.:194161   3rd Qu.:2021                                        
##  Max.   :258881   Max.   :2023                                        
##    prefrace         prefethnicity       prefgender         age_group        
##  Length:258882      Length:258882      Length:258882      Length:258882     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     n_rwdpts           n_rwdenc         n_fungal         n_fungalenc      
##  Min.   :     1.0   Min.   :     1   Min.   : 0.00000   Min.   :  0.0000  
##  1st Qu.:     2.0   1st Qu.:     5   1st Qu.: 0.00000   1st Qu.:  0.0000  
##  Median :    10.0   Median :    29   Median : 0.00000   Median :  0.0000  
##  Mean   :   492.2   Mean   :  2178   Mean   : 0.08021   Mean   :  0.2077  
##  3rd Qu.:    71.0   3rd Qu.:   217   3rd Qu.: 0.00000   3rd Qu.:  0.0000  
##  Max.   :153027.0   Max.   :908579   Max.   :96.00000   Max.   :189.0000
asp <- asp_raw %>%
  dplyr::select(year, state = prefstate, urban = prefurban, race = prefrace,
                ethnicity = prefethnicity, gender = prefgender, age = age_group,
                n_rwdpts, n_fungal)

1) Look at distribution of RWD patients over time

a. number

# how many  patient-years total?
asp %>%
  summarize(n = sum(n_rwdpts))
## # A tibble: 1 × 1
##           n
##       <dbl>
## 1 127414388
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts)) +
  geom_bar(stat = "identity", width = 0.7, fill = "#91BAB6") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 1)) + 
  theme_minimal()
p

b. race

# race of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = race)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_brewer(palette = "Set2") + 
  theme_minimal()
p

White or Caucasian individuals make up the majority of Cerner patients captured in RWD, each year. Black or African American as well as some other race and not mapped/missing make up the remaining majority.

c. ethnicity

# ethnicity of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = ethnicity)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 4)) + 
  theme_minimal()
p

Most patients in RWD are not Hispanic or Latino. The number of Hispanic or Latino patients seems to be increasing over time.

d. age

# age group of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = age)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) + 
  theme_minimal()
p

Most patients in RWD are 35 to 64. The number of patients in all age categories increases over time.

e. sex/gender

# sex/gender of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = gender)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) + 
  theme_minimal()
p

There appears to be slightly more females in RWD than males each year.

f. urban/rural

# urban/rural of all RWD patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_rwdpts, fill = urban)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) + 
  theme_minimal()
p

The majority of patients live in urban areas.

2) Look at distribution of aspergillosis patients over time

a. number

p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal)) +
  geom_bar(stat = "identity", width = 0.7, fill = "#91BAB6") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis over study period", 
       y = "Number of Patients", x = "Year")

asp_prev <- asp %>%
  group_by(year) %>%
  summarize(n_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts)) %>%
  mutate(rwd_prev = (n_tot/rwd_tot)*100)

p <- ggplot(data = asp_prev, aes(x = as.factor(year), y = rwd_prev, group = 1)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()
p + labs(title = "Prevalence of patients with aspergillosis diagnosis over study period", 
       y = "Prevalence (%)", x = "Year")

# how many aspergillosis patients total?
asp %>%
  summarize(n = sum(n_fungal))
## # A tibble: 1 × 1
##       n
##   <dbl>
## 1 20764
forweights %>%
  summarize(n = sum(n_fungal))
## # A tibble: 1 × 1
##       n
##   <dbl>
## 1 20764
# how many aspergillosis patients per year?
n_patients <- asp %>%
  group_by(year) %>%
  summarize(n_tot = sum(n_fungal))

n_patients
## # A tibble: 11 × 2
##     year n_tot
##    <dbl> <dbl>
##  1  2013   694
##  2  2014   854
##  3  2015  1057
##  4  2016  1388
##  5  2017  1673
##  6  2018  1998
##  7  2019  2197
##  8  2020  2349
##  9  2021  2858
## 10  2022  2879
## 11  2023  2817
Weighted prevalence
# join weights to data
weighted <- left_join(forweights, weights, by = c("year", "state", "gender", 
                                                  "eth_race", "age_group")) %>%
  na.omit()

# annual prevalence
weighted_annual <- weighted %>%
  mutate(weight_case = n_fungal*weights_trim,
         weight_n = n_rwdpts*weights_trim) %>%
  group_by(year) %>%
  summarize(fungal_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts), 
            weight_fungal_tot = sum(weight_case), weight_rwd_tot = sum(weight_n)) %>%
  mutate(prevalence = fungal_tot/rwd_tot*100000,
         SE = sqrt((weight_fungal_tot/weight_rwd_tot) * 
                     (1 - (weight_fungal_tot/weight_rwd_tot))/weight_rwd_tot)*100000,
         ci_lower = prevalence - 1.96*SE,
         ci_upper = prevalence + 1.96*SE)

# state overall prevalence
weighted_state <- weighted %>%
  mutate(weight_case = n_fungal*weights_trim,
         weight_n = n_rwdpts*weights_trim) %>%
  group_by(state) %>%
  summarize(fungal_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts), 
            weight_fungal_tot = sum(weight_case), weight_rwd_tot = sum(weight_n)) %>%
  mutate(prevalence = (weight_fungal_tot/weight_rwd_tot)*100000)
         

# overall prevalence
weighted_all <- weighted %>%
  mutate(weight_case = n_fungal*weights_trim,
         weight_n = n_rwdpts*weights_trim) %>%
  summarize(prev = sum(weight_case), denom = sum(weight_n)) %>%
  mutate(prevalence = prev/denom*100000)
# graph
p <- ggplot(data = weighted_annual, aes(x = as.factor(year), y = prevalence, group = 1)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  scale_y_continuous(labels = scales::comma, limits = c(0, NA)) +
  theme_minimal()
p2 <- p + labs(title = "", 
       y = "Weighted Prevalence per 100,000 person-years", x = "Year")

p2

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/weighted_prev.png", plot = p2, dpi = 300)
# map

# bring in state shapefile 
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(tmaptools)
library(stringr)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
states <- states(cb = TRUE)
## Retrieving data for the year 2021
##   |                                                                              |                                                                      |   0%  |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |==                                                                    |   3%  |                                                                              |===                                                                   |   4%  |                                                                              |=====                                                                 |   7%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |==============                                                        |  20%  |                                                                              |============================                                          |  40%  |                                                                              |====================================================                  |  74%  |                                                                              |==================================================================    |  95%  |                                                                              |======================================================================| 100%
# add state geography
weight_map <- weighted_state %>%
  left_join(states, by = c("state" = "STUSPS")) %>%
  dplyr::select(state, weight_fungal_tot, weight_rwd_tot, prevalence, geometry) %>%
  na.omit()
weight_map <- st_as_sf(x = weight_map)

# split US into three: contiguous, Alaska, and Hawaii
us_cont <- weight_map %>%
  filter(!state %in% c("AK", "HI"))
us_al <- weight_map %>%
  filter(state == "AK")
us_hi <- weight_map %>%
  filter(state == "HI")

# plot contiguous states
weightrt <- tm_shape(us_cont, projection = 2163) +
  tm_polygons(col = "prevalence", style = "jenks",
              title = "Weighted Prevalence per 100,000 Individuals",
              palette = c("#91BAB6", "#BDC881", "#E3B710", "#EC7A05", "#F11B00"),
              border.col = "black",
              border.alpha = 0.1,
              legend.show = FALSE) +
  tm_layout(frame = FALSE,
            legend.position = c("right", "bottom"),
            inner.margins = c(.25, .02, .02, .02)) +
  tm_add_legend(type = "fill", 
    labels = c("0 to <9.5", "9.5 to <13.8", "13.8 to <21.4", "21.4 to <30.1", ">30.1"),
    col = c("#91BAB6", "#BDC881", "#E3B710", "#EC7A05", "#F11B00"),
    border.lwd = 0.5,
    title = "Weighted Prevalence per 100,000 Individuals")

# create inset map of Alaska
weight_ak <- tm_shape(us_al, projection = 3338) +
  tm_fill(col = "#91BAB6",
          border.col = "black",
          border.alpha = 0.1,
          legend.show = FALSE) +
  tm_layout(frame = FALSE)

# create inset map of Hawaii
weight_hi <- tm_shape(us_hi, projection = 3759) +
  tm_fill(col = "#E3B710",
          border.col = "black",
          border.alpha = 0.1,
          legend.show = FALSE) +
  tm_layout(frame = FALSE)
  
# plot insets
library(grid)
weightrt
print(weight_ak, vp = viewport(x = .15, y = .15, width = .3, height = .3))
print(weight_hi, vp = viewport(x = .00001, y = .5, width = .3, height = .3))

b. race

# race of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = race)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_brewer(palette = "Set2") + 
  theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis race over study period", 
       y = "Number of Patients", x = "Year")

The majority of aspergillosis patients each year are White or Caucasian. The number of Asian patients with aspergillosis appears to increase slightly over time, as does the number of Black or African American and those of some other race.

c. ethnicity

# ethnicity of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = ethnicity)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 4)) + 
  theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis ethnicity over study period", 
       y = "Number of Patients", x = "Year")

Most aspergillosis patients in RWD are not Hispanic or Latino. The number of Hispanic or Latino patients does increase over time.

d. age

# age group of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = age)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) + 
  theme_minimal()
p

Most aspergillosis patients are 35 to 64 or 65 and over.

e. sex/gender

# sex/gender of aspergillosis patients
p <- ggplot(data = asp, aes(x = as.factor(year), y = n_fungal, fill = gender)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 3)) + 
  theme_minimal()
p + labs(title = "Distribution of patients with aspergillosis diagnosis sex over study period", 
       y = "Number of Patients", x = "Year")

There appears to be a relatively even split in the number of aspergillosis patients by sex/gender each year.

3) Proportion of aspergillosis patients compared to proportion of total patients

a. race

# race groups
race_grp1 <- asp %>%
  group_by(year, race) %>%
  summarize(race_fung_tot = sum(n_fungal), race_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
race_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
  mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100, 
         racefung_of_fungtot = (race_fung_tot/fung_tot)*100,
         race_of_tot = (race_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("racefung_of_fungtot", "race_of_tot"), 
               names_to = "variable",
               values_to = "percent")

race_grp_tbl <- race_grp %>%
  group_by(race) %>%
  summarize(race_fung_tot = sum(race_fung_tot), race_rwd_tot = sum(race_rwd_tot))
       
ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~race) +
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

Very few cases occur in AI/AN, Asian, and NH/API patients - making visual comparisons for these groups difficult. Black or African American patients appear to make up a similar proportion of aspergillosis patients relative to their proportion in the RWD patient totals. White or Caucasian patients appear over represented in aspergillosis cases compared to their proportion in the RWD patient totals for every year. Individuals identifying as some other race appear consistently over represented compared to their RWD patient totals.

b. ethnicity

# ethnic groups
eth_grp1 <- asp %>%
  group_by(year, ethnicity) %>%
  summarize(eth_fung_tot = sum(n_fungal), eth_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
eth_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

eth_grp <- left_join(eth_grp1, eth_grp2, by = "year") %>%
  mutate(ethfung_of_ethtot = (eth_fung_tot/eth_rwd_tot)*100, 
         ethfung_of_fungtot = (eth_fung_tot/fung_tot)*100,
         eth_of_tot = (eth_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("ethfung_of_fungtot", "eth_of_tot"), names_to = "variable",
               values_to = "percent")
       
ggplot(eth_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ethnicity) +
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) +
  theme_minimal()

There are a few years where patients that are Hispanic or Latino appear over represented in aspergillosis patients. Patients that are not Hispanic or Latino appear over represented in aspergillosis patients most years until 2019.

c. sex/gender

# sex/gender groups
sex_grp1 <- asp %>%
  group_by(year, gender) %>%
  summarize(sex_fung_tot = sum(n_fungal), sex_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
sex_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

sex_grp <- left_join(sex_grp1, sex_grp2, by = "year") %>%
  mutate(sexfung_of_sextot = (sex_fung_tot/sex_rwd_tot)*100, 
         sexfung_of_fungtot = (sex_fung_tot/fung_tot)*100,
         sex_of_tot = (sex_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("sexfung_of_fungtot", "sex_of_tot"), names_to = "variable",
               values_to = "percent")
       
ggplot(sex_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~gender) +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

Male patients are over represented in aspergillosis cases. Each year, a larger percent of all aspergillosis patients were male than their percent in the RWD patient universe.

d. age

# age groups
age_grp1 <- asp %>%
  group_by(year, age) %>%
  summarize(age_fung_tot = sum(n_fungal), age_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
age_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

age_grp <- left_join(age_grp1, age_grp2, by = "year") %>%
  mutate(agefung_of_agetot = (age_fung_tot/age_rwd_tot)*100, 
         agefung_of_fungtot = (age_fung_tot/fung_tot)*100,
         age_of_tot = (age_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("agefung_of_fungtot", "age_of_tot"), names_to = "variable",
               values_to = "percent")
       
ggplot(age_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~age) +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

Patients aged 65 years and older are very over represented in aspergillosis patients every year. Other age groups are under represented

e. urban/rural

# urban/rural
urban_grp1 <- asp %>%
  group_by(year, urban) %>%
  summarize(urban_fung_tot = sum(n_fungal), urban_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
urban_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

urban_grp <- left_join(urban_grp1, urban_grp2, by = "year") %>%
  mutate(urbanfung_of_urbantot = (urban_fung_tot/urban_rwd_tot)*100, 
         urbanfung_of_fungtot = (urban_fung_tot/fung_tot)*100,
         urban_of_tot = (urban_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("urbanfung_of_fungtot", "urban_of_tot"), names_to = "variable",
               values_to = "percent")
       
ggplot(urban_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~urban) +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

f. state

# state
state_grp1 <- asp %>%
  group_by(year, state) %>%
  summarize(state_fung_tot = sum(n_fungal), state_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
state_grp2 <- asp %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

state_grp <- left_join(state_grp1, state_grp2, by = "year") %>%
  mutate(statefung_of_statetot = (state_fung_tot/state_rwd_tot)*100, 
         statefung_of_fungtot = (state_fung_tot/fung_tot)*100,
         state_of_tot = (state_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("statefung_of_fungtot", "state_of_tot"), names_to = "variable",
               values_to = "percent")
       
ggplot(state_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~state) +
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_classic()

Several states have too few cases or RWD patients to visually compare in this way. Of the states that we can compare, Arizona, California, South Carolina, Kansas for some years, North Carolina for some years, and New Mexico are over represented each year.

g. proportion of aspergillosis patients compared to proportion of demographic

# race groups
race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
  mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100) %>%
  pivot_longer(cols = c("racefung_of_racetot"), 
               names_to = "variable",
               values_to = "percent")
       
ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~race) + 
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Zissou1", n = 1),
                    name = "", labels = c("% of all race total")) + 
  theme_minimal()

Model Building

1) Data Wrangling

# remove "unknown" as a category
asp_na <- as.data.frame(asp) %>%
  mutate_all(~replace(.,. == "Other or Unknown", NA)) %>%
  mutate_all(~replace(.,. == "Unknown", NA)) %>%
  mutate(state = as.factor(state), year = as.integer(year), gender = as.factor(gender), 
         race = as.factor(race), ethnicity = as.factor(ethnicity), age = as.factor(age), 
         urban = as.factor(urban))

# combine asian/pacific islander to reduce race categories
asp_na$race2 <- recode_factor(asp_na$race, 
                       "American Indian or Alaska Native" = "American Indian or Alaska Native",
                       "Asian" = "Asian/Pacific Islander",
                       "Black or African American" = "Black or African American",
                       "Native Hawaiian or Other Pacific Islander" = "Asian/Pacific Islander",
                       "Some Other Race" = "Other",
                       "White or Caucasian" = "White")

# combine counts for new race categories
asp2 <- asp_na %>%
  group_by(state, year, gender, ethnicity, age, urban, race2) %>%
  summarize(n_fungal = sum(n_fungal),
            n_rwdpts = sum(n_rwdpts)) %>%
  ungroup()
## `summarise()` has grouped output by 'state', 'year', 'gender', 'ethnicity',
## 'age', 'urban'. You can override using the `.groups` argument.
df <- asp2 %>%
  mutate(male = as.factor(case_when(gender == 'Male' ~ 1, TRUE ~ 0)), 
         rural = as.factor(case_when(urban == 'rural' ~ 1, TRUE ~ 0))) %>%
  dplyr::select(state, year, male, race2, ethnicity, rural, age, 
                n_fungal, n_rwdpts) %>%
  filter(n_rwdpts > 0) # remove strata where there are no RWD patients meeting those demographics (n_rwdpts is the offset variable and cannot take the log of 0)

df_re <- asp2 %>%
  mutate(race2 = as.factor(race2), 
         male = as.factor(case_when(gender == 'Male' ~ 1, TRUE ~ 0)), 
         rural = as.factor(case_when(urban == 'rural' ~ 1, TRUE ~ 0)),
         race_eth = case_when(ethnicity == 'Hispanic or Latino' ~ 'Hispanic or Latino', 
                              TRUE ~ race2)) %>%
  dplyr::select(state, year, male, race_eth, rural, age, n_fungal, n_rwdpts) %>%
  filter(n_rwdpts > 0) # remove strata where there are no RWD patients meeting those demographics (n_rwdpts is the offset variable and cannot take the log of 0)

df$race2 <- relevel(df$race2, ref = "White")
df$ethnicity <- relevel(df$ethnicity, ref = "Non-Hispanic")
df$age <- relevel(df$age, ref = "18 to 34")
df_re$race_eth <- as.factor(df_re$race_eth) %>%
  relevel(df_re$race_eth, ref = "White")
df_re$age <- relevel(df$age, ref = "18 to 34")

# covid variables
df$covid <- ifelse(df$year >=2020, 1, 0)
df_re$covid <- ifelse(df$year >= 2020, 1, 0)
df$year_center <- df$year - 2019
df_re$year_center <- df_re$year - 2019
df$year_count <- df$year - 2013
# histogram of number aspergillosis patients
p <- ggplot(data = df, aes(x = n_fungal)) +
  geom_histogram(binwidth = 1, fill = "#69b3a2", alpha = 0.9) +
  theme_minimal()
p

2) Graphs with analytic data

# race groups
race_grp1 <- df %>%
  group_by(year, race2) %>%
  summarize(race_fung_tot = sum(n_fungal), race_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
race_grp2 <- df %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

race_grp <- left_join(race_grp1, race_grp2, by = "year") %>%
  mutate(racefung_of_racetot = (race_fung_tot/race_rwd_tot)*100, 
         racefung_of_fungtot = (race_fung_tot/fung_tot)*100,
         race_of_tot = (race_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("racefung_of_fungtot", "race_of_tot"), 
               names_to = "variable",
               values_to = "percent")

race_grp_tbl <- race_grp %>%
  group_by(race2) %>%
  summarize(race_fung_tot = sum(race_fung_tot), race_rwd_tot = sum(race_rwd_tot))
       
race_plot <- ggplot(race_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~race2, scales = "free_y") +
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

race_plot

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/race_plot.png", plot = race_plot, width = 10, dpi = 300)
# ethnic groups
eth_grp1 <- df %>%
  group_by(year, ethnicity) %>%
  summarize(eth_fung_tot = sum(n_fungal), eth_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
eth_grp2 <- df %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

eth_grp <- left_join(eth_grp1, eth_grp2, by = "year") %>%
  mutate(ethfung_of_ethtot = (eth_fung_tot/eth_rwd_tot)*100, 
         ethfung_of_fungtot = (eth_fung_tot/fung_tot)*100,
         eth_of_tot = (eth_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("ethfung_of_fungtot", "eth_of_tot"), names_to = "variable",
               values_to = "percent")
       
ethnic_plot <- ggplot(eth_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ethnicity, scales = "free_y",
             labeller = as_labeller(c("Hispanic or Latino" = "Hispanic or Latino", 
                                                  "Non-Hispanic" = "Non-Hispanic or Latino", 
                                                  "Unknown" = "Unknown"))) +
  scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) +
  theme_minimal()

ethnic_plot

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/ethnic_plot.png", plot = ethnic_plot, width = 10, dpi = 300)
# sex/gender groups
sex_grp1 <- df %>%
  group_by(year, male) %>%
  summarize(sex_fung_tot = sum(n_fungal), sex_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
sex_grp2 <- df %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

sex_grp <- left_join(sex_grp1, sex_grp2, by = "year") %>%
  mutate(sexfung_of_sextot = (sex_fung_tot/sex_rwd_tot)*100, 
         sexfung_of_fungtot = (sex_fung_tot/fung_tot)*100,
         sex_of_tot = (sex_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("sexfung_of_fungtot", "sex_of_tot"), names_to = "variable",
               values_to = "percent")
       
sex_plot <- ggplot(sex_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~male, scales = "free_y",
             labeller = as_labeller(c("0" = "Female", "1" = "Male"))) +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

sex_plot

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/sex_plot.png", plot = sex_plot, width = 8, dpi = 300)
# age groups
age_grp1 <- df %>%
  group_by(year, age) %>%
  summarize(age_fung_tot = sum(n_fungal), age_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
age_grp2 <- df %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

age_grp <- left_join(age_grp1, age_grp2, by = "year") %>%
  mutate(agefung_of_agetot = (age_fung_tot/age_rwd_tot)*100, 
         agefung_of_fungtot = (age_fung_tot/fung_tot)*100,
         age_of_tot = (age_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("agefung_of_fungtot", "age_of_tot"), names_to = "variable",
               values_to = "percent")
       
age_plot <- ggplot(age_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~age, scales = "free_y") +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = "") + 
  theme_minimal()

age_plot

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/age_plot.png", plot = age_plot, width = 10, dpi = 300)
# urban/rural
urban_grp1 <- df %>%
  group_by(year, rural) %>%
  summarize(urban_fung_tot = sum(n_fungal), urban_rwd_tot = sum(n_rwdpts))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
urban_grp2 <- df %>%
  group_by(year) %>%
  summarize(fung_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts))

urban_grp <- left_join(urban_grp1, urban_grp2, by = "year") %>%
  mutate(urbanfung_of_urbantot = (urban_fung_tot/urban_rwd_tot)*100, 
         urbanfung_of_fungtot = (urban_fung_tot/fung_tot)*100,
         urban_of_tot = (urban_rwd_tot/rwd_tot)*100) %>%
  pivot_longer(cols = c("urbanfung_of_fungtot", "urban_of_tot"), names_to = "variable",
               values_to = "percent")
       
urban_plot <- ggplot(urban_grp, aes(x = as.factor(year), y = percent, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~rural, scales = "free_y",
             labeller = as_labeller(c("0" = "Urban", "1" = "Rural"))) +
    scale_x_discrete(name = "Year", 
                   labels = c("'13", "'14", "'15", "'16", "'17", "'18", "'19", "'20",
                              "'21", "'22", "23")) +
  scale_fill_manual(values = wes_palette("Darjeeling2", n = 2),
                    name = "", labels = c("% of all patients", 
                                          "% of aspergillosis patients")) + 
  theme_minimal()

urban_plot

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/urban_plot.png", plot = urban_plot, width = 8, dpi = 300)

3) Univariable models

B) Year

# year
year <- glm(n_fungal ~ year,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(year)
## 
## Call:
## glm(formula = n_fungal ~ year, family = poisson, data = df, offset = log(n_rwdpts))
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -87.101583   4.857266  -17.93   <2e-16 ***
## year          0.038819   0.002405   16.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30602  on 133559  degrees of freedom
## Residual deviance: 30336  on 133558  degrees of freedom
## AIC: 46322
## 
## Number of Fisher Scoring iterations: 7

Have quite a few significant years (when compared to 2013)

C) Gender/Sex

# gender/sex
sex <- glm(n_fungal ~ male,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(sex)
## 
## Call:
## glm(formula = n_fungal ~ male, family = poisson, data = df, offset = log(n_rwdpts))
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.890208   0.009864 -901.29   <2e-16 ***
## male1        0.366160   0.013880   26.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30602  on 133559  degrees of freedom
## Residual deviance: 29909  on 133558  degrees of freedom
## AIC: 45894
## 
## Number of Fisher Scoring iterations: 7

Females significantly less likely to have aspergillosis diagnosis than males

D) Ethnicity

# ethnicity
eth <- glm(n_fungal ~ ethnicity,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(eth)
## 
## Call:
## glm(formula = n_fungal ~ ethnicity, family = poisson, data = df, 
##     offset = log(n_rwdpts))
## 
## Coefficients:
##                              Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)                 -8.709450   0.007914 -1100.497   <2e-16 ***
## ethnicityHispanic or Latino -0.026910   0.019351    -1.391    0.164    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 25166  on 90622  degrees of freedom
## Residual deviance: 25164  on 90621  degrees of freedom
##   (42937 observations deleted due to missingness)
## AIC: 39090
## 
## Number of Fisher Scoring iterations: 7

Hispanic or Latino significant (less likely)

E) Race

# race
mrace <- glm(n_fungal ~ race2,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(mrace)
## 
## Call:
## glm(formula = n_fungal ~ race2, family = poisson, data = df, 
##     offset = log(n_rwdpts))
## 
## Coefficients:
##                                        Estimate Std. Error   z value Pr(>|z|)
## (Intercept)                           -8.720609   0.008077 -1079.702  < 2e-16
## race2American Indian or Alaska Native -0.171455   0.086740    -1.977 0.048080
## race2Asian/Pacific Islander            0.297584   0.038597     7.710 1.26e-14
## race2Black or African American        -0.090271   0.025034    -3.606 0.000311
## race2Other                             0.268391   0.022908    11.716  < 2e-16
##                                          
## (Intercept)                           ***
## race2American Indian or Alaska Native *  
## race2Asian/Pacific Islander           ***
## race2Black or African American        ***
## race2Other                            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 27442  on 106876  degrees of freedom
## Residual deviance: 27235  on 106872  degrees of freedom
##   (26683 observations deleted due to missingness)
## AIC: 42217
## 
## Number of Fisher Scoring iterations: 8

Asian/Pacific Islander, African-American/Black, and other race groups significant

F) Age

# age
mage <- glm(n_fungal ~ age,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(mage)
## 
## Call:
## glm(formula = n_fungal ~ age, family = poisson, data = df, offset = log(n_rwdpts))
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -9.66893    0.02219 -435.75   <2e-16 ***
## age35 to 64     0.84553    0.02462   34.34   <2e-16 ***
## age65 and Over  1.48513    0.02435   60.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30602  on 133559  degrees of freedom
## Residual deviance: 25517  on 133557  degrees of freedom
## AIC: 41505
## 
## Number of Fisher Scoring iterations: 8

Both age groups significant compared to 18 to 34 year olds

F) Rural/Urban

# rural
rural <- glm(n_fungal ~ rural,
            family = poisson,
            offset = log(n_rwdpts),
            data = df)
summary(rural)
## 
## Call:
## glm(formula = n_fungal ~ rural, family = poisson, data = df, 
##     offset = log(n_rwdpts))
## 
## Coefficients:
##              Estimate Std. Error   z value Pr(>|z|)    
## (Intercept) -8.696914   0.007561 -1150.264  < 2e-16 ***
## rural1      -0.149385   0.019049    -7.842 4.43e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30602  on 133559  degrees of freedom
## Residual deviance: 30538  on 133558  degrees of freedom
## AIC: 46523
## 
## Number of Fisher Scoring iterations: 7

Rural significantly less likely to have patients with aspergillosis diagnosis

4) Multivariable models

1) Data Wrangling & Exploration

# remove NAs 
df1 <- na.omit(df)
df_re1 <- na.omit(df_re)

# number of fungal cases with complete data
p <- ggplot(data = df1, aes(x = as.factor(year), y = n_fungal)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = wes_palette("Darjeeling1", n = 1)) + 
  theme_minimal()
p

# number of patients with complete data
df1 %>%
  summarize(n = sum(n_rwdpts))
## # A tibble: 1 × 1
##           n
##       <dbl>
## 1 111203140
# prevalence with complete data
asp_prev_complete <- df1 %>%
  group_by(year) %>%
  summarize(n_tot = sum(n_fungal), rwd_tot = sum(n_rwdpts)) %>%
  mutate(rwd_prev = (n_tot/rwd_tot)*100)

p <- ggplot(data = asp_prev_complete, aes(x = as.factor(year), y = rwd_prev, group = 1)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()
p

2) Run the Models

pois <- glm(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + male + age + 
              state + year_center*covid,
            family = "poisson",
            offset = log(n_rwdpts),
            data = df1)
logbin <- glm(formula = cbind(n_fungal, n_rwdpts) ~ 
              race2*covid + ethnicity*covid + rural*covid + male + age + 
                state + year_center*covid,
            data = df1,
            family = "binomial"(link = "log"))
qpois <- glm(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + male + age + 
               state + year_center*covid,
            family = "quasipoisson",
            offset = log(n_rwdpts),
            data = df1)
negbin <- glm.nb(n_fungal ~ race2*covid + ethnicity*covid + rural*covid + age + male + 
                   state + year_center*covid + 
                offset(log(n_rwdpts)),
                 data = df1)
Exponentiate coefficients
library(jtools)
summ(pois, exp = T)
## MODEL INFO:
## Observations: 72638
## Dependent Variable: n_fungal
## Type: Generalized linear model
##   Family: poisson 
##   Link function: log 
## 
## MODEL FIT:
## χ²(68) = 9658.12, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.32
## Pseudo-R² (McFadden) = 0.27
## AIC = 26549.99, BIC = 27184.33 
## 
## Standard errors: MLE
## -------------------------------------------------------------------------
##                                  exp(Est.)   2.5%   97.5%   z val.      p
## ------------------------------ ----------- ------ ------- -------- ------
## (Intercept)                           0.00   0.00    0.00   -19.31   0.00
## race2American Indian or               0.78   0.59    1.02    -1.81   0.07
## Alaska Native                                                            
## race2Asian/Pacific                    0.93   0.82    1.05    -1.13   0.26
## Islander                                                                 
## race2Black or African                 1.06   0.98    1.14     1.45   0.15
## American                                                                 
## race2Other                            1.25   1.15    1.35     5.41   0.00
## covid                                 1.02   0.95    1.08     0.51   0.61
## ethnicityHispanic or                  0.84   0.79    0.91    -4.68   0.00
## Latino                                                                   
## rural1                                0.86   0.80    0.91    -4.92   0.00
## male1                                 1.37   1.33    1.41    21.38   0.00
## age35 to 64                           2.56   2.42    2.70    34.04   0.00
## age65 and Over                        4.95   4.69    5.23    58.10   0.00
## stateAL                               0.68   0.25    1.83    -0.77   0.44
## stateAR                               0.65   0.21    2.00    -0.74   0.46
## stateAZ                               1.63   0.61    4.35     0.98   0.33
## stateCA                               1.43   0.54    3.81     0.71   0.48
## stateCO                               0.42   0.16    1.14    -1.71   0.09
## stateCT                               0.66   0.25    1.78    -0.81   0.42
## stateDC                               0.57   0.21    1.58    -1.08   0.28
## stateDE                               1.01   0.38    2.70     0.02   0.98
## stateFL                               0.51   0.19    1.35    -1.36   0.17
## stateGA                               1.19   0.44    3.20     0.34   0.73
## stateHI                               1.15   0.42    3.13     0.27   0.78
## stateIA                               0.58   0.22    1.55    -1.09   0.28
## stateID                               0.47   0.15    1.41    -1.35   0.18
## stateIL                               0.45   0.17    1.21    -1.59   0.11
## stateIN                               0.80   0.30    2.13    -0.45   0.65
## stateKS                               0.82   0.30    2.25    -0.38   0.70
## stateKY                               0.83   0.31    2.23    -0.37   0.71
## stateLA                               0.52   0.12    2.33    -0.85   0.39
## stateMA                               0.67   0.25    1.80    -0.79   0.43
## stateMD                               0.51   0.19    1.39    -1.31   0.19
## stateME                               0.53   0.20    1.43    -1.25   0.21
## stateMI                               0.81   0.28    2.36    -0.38   0.70
## stateMN                               0.65   0.23    1.82    -0.83   0.41
## stateMO                               0.63   0.23    1.67    -0.94   0.35
## stateMS                               0.59   0.21    1.64    -1.01   0.31
## stateMT                               0.66   0.24    1.76    -0.84   0.40
## stateNC                               1.68   0.62    4.52     1.03   0.30
## stateND                               1.41   0.41    4.81     0.54   0.59
## stateNE                               0.59   0.22    1.59    -1.04   0.30
## stateNH                               0.68   0.25    1.86    -0.74   0.46
## stateNJ                               0.58   0.22    1.55    -1.08   0.28
## stateNM                               0.60   0.22    1.63    -1.00   0.32
## stateNV                               0.25   0.08    0.79    -2.35   0.02
## stateNY                               0.78   0.29    2.07    -0.51   0.61
## stateOH                               0.41   0.15    1.11    -1.75   0.08
## stateOK                               0.57   0.15    2.11    -0.85   0.40
## stateOR                               0.64   0.24    1.73    -0.88   0.38
## statePA                               0.63   0.24    1.68    -0.93   0.35
## stateRI                               4.97   1.60   15.40     2.78   0.01
## stateSC                               1.00   0.37    2.68     0.00   1.00
## stateSD                               0.96   0.33    2.77    -0.08   0.94
## stateTN                               0.59   0.22    1.57    -1.06   0.29
## stateTX                               0.50   0.19    1.34    -1.37   0.17
## stateUT                               0.08   0.02    0.30    -3.76   0.00
## stateVA                               0.75   0.28    2.01    -0.58   0.56
## stateVT                               0.43   0.15    1.18    -1.64   0.10
## stateWA                               0.29   0.10    0.81    -2.34   0.02
## stateWI                               0.64   0.24    1.71    -0.90   0.37
## stateWV                               0.63   0.24    1.69    -0.91   0.36
## stateWY                               0.49   0.18    1.35    -1.38   0.17
## year_center                           1.05   1.04    1.07     9.39   0.00
## race2American Indian or               1.15   0.80    1.65     0.77   0.44
## Alaska Native:covid                                                      
## race2Asian/Pacific                    1.28   1.09    1.50     3.08   0.00
## Islander:covid                                                           
## race2Black or African                 1.04   0.94    1.16     0.83   0.41
## American:covid                                                           
## race2Other:covid                      1.00   0.90    1.11    -0.01   0.99
## covid:ethnicityHispanic or            1.18   1.08    1.30     3.48   0.00
## Latino                                                                   
## covid:rural1                          1.06   0.98    1.15     1.40   0.16
## covid:year_center                     0.95   0.93    0.97    -4.89   0.00
## -------------------------------------------------------------------------
summ(logbin, exp = T)
## MODEL INFO:
## Observations: 72638
## Dependent Variable: cbind(n_fungal, n_rwdpts)
## Type: Generalized linear model
##   Family: binomial 
##   Link function: log 
## 
## MODEL FIT:
## χ²(68) = 9656.22, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.32
## Pseudo-R² (McFadden) = 0.27
## AIC = 26536.39, BIC = 27170.72 
## 
## Standard errors: MLE
## -------------------------------------------------------------------------
##                                  exp(Est.)   2.5%   97.5%   z val.      p
## ------------------------------ ----------- ------ ------- -------- ------
## (Intercept)                           0.00   0.00    0.00   -19.32   0.00
## race2American Indian or               0.78   0.59    1.02    -1.81   0.07
## Alaska Native                                                            
## race2Asian/Pacific                    0.93   0.82    1.05    -1.13   0.26
## Islander                                                                 
## race2Black or African                 1.06   0.98    1.14     1.45   0.15
## American                                                                 
## race2Other                            1.25   1.15    1.35     5.41   0.00
## covid                                 1.02   0.95    1.08     0.51   0.61
## ethnicityHispanic or                  0.84   0.79    0.91    -4.68   0.00
## Latino                                                                   
## rural1                                0.86   0.80    0.91    -4.92   0.00
## male1                                 1.37   1.33    1.41    21.38   0.00
## age35 to 64                           2.56   2.42    2.70    34.04   0.00
## age65 and Over                        4.95   4.69    5.23    58.09   0.00
## stateAL                               0.68   0.25    1.83    -0.77   0.44
## stateAR                               0.65   0.21    2.00    -0.74   0.46
## stateAZ                               1.63   0.61    4.35     0.98   0.33
## stateCA                               1.43   0.54    3.81     0.71   0.48
## stateCO                               0.42   0.16    1.14    -1.71   0.09
## stateCT                               0.66   0.25    1.78    -0.81   0.42
## stateDC                               0.57   0.21    1.58    -1.08   0.28
## stateDE                               1.01   0.38    2.70     0.02   0.98
## stateFL                               0.51   0.19    1.35    -1.36   0.17
## stateGA                               1.19   0.44    3.20     0.34   0.73
## stateHI                               1.15   0.42    3.13     0.27   0.78
## stateIA                               0.58   0.22    1.55    -1.09   0.28
## stateID                               0.47   0.15    1.41    -1.35   0.18
## stateIL                               0.45   0.17    1.21    -1.59   0.11
## stateIN                               0.80   0.30    2.13    -0.45   0.65
## stateKS                               0.82   0.30    2.25    -0.38   0.70
## stateKY                               0.83   0.31    2.22    -0.37   0.71
## stateLA                               0.52   0.12    2.33    -0.85   0.39
## stateMA                               0.67   0.25    1.80    -0.79   0.43
## stateMD                               0.51   0.19    1.39    -1.31   0.19
## stateME                               0.53   0.20    1.43    -1.25   0.21
## stateMI                               0.81   0.28    2.36    -0.38   0.70
## stateMN                               0.65   0.23    1.82    -0.83   0.41
## stateMO                               0.63   0.23    1.67    -0.94   0.35
## stateMS                               0.59   0.21    1.64    -1.01   0.31
## stateMT                               0.66   0.24    1.76    -0.84   0.40
## stateNC                               1.68   0.62    4.52     1.03   0.30
## stateND                               1.41   0.41    4.81     0.54   0.59
## stateNE                               0.59   0.22    1.59    -1.04   0.30
## stateNH                               0.68   0.25    1.86    -0.74   0.46
## stateNJ                               0.58   0.22    1.55    -1.08   0.28
## stateNM                               0.60   0.22    1.63    -1.00   0.32
## stateNV                               0.25   0.08    0.79    -2.35   0.02
## stateNY                               0.78   0.29    2.07    -0.51   0.61
## stateOH                               0.41   0.15    1.11    -1.75   0.08
## stateOK                               0.57   0.15    2.11    -0.85   0.40
## stateOR                               0.64   0.24    1.73    -0.88   0.38
## statePA                               0.63   0.24    1.68    -0.93   0.35
## stateRI                               4.96   1.60   15.38     2.77   0.01
## stateSC                               1.00   0.37    2.68     0.00   1.00
## stateSD                               0.96   0.33    2.77    -0.08   0.94
## stateTN                               0.59   0.22    1.57    -1.06   0.29
## stateTX                               0.50   0.19    1.34    -1.37   0.17
## stateUT                               0.08   0.02    0.30    -3.76   0.00
## stateVA                               0.75   0.28    2.01    -0.58   0.56
## stateVT                               0.43   0.15    1.18    -1.64   0.10
## stateWA                               0.29   0.10    0.81    -2.35   0.02
## stateWI                               0.64   0.24    1.71    -0.90   0.37
## stateWV                               0.63   0.24    1.69    -0.91   0.36
## stateWY                               0.49   0.18    1.35    -1.38   0.17
## year_center                           1.05   1.04    1.07     9.39   0.00
## race2American Indian or               1.15   0.80    1.65     0.77   0.44
## Alaska Native:covid                                                      
## race2Asian/Pacific                    1.28   1.09    1.50     3.08   0.00
## Islander:covid                                                           
## race2Black or African                 1.04   0.94    1.16     0.83   0.41
## American:covid                                                           
## race2Other:covid                      1.00   0.90    1.11    -0.01   0.99
## covid:ethnicityHispanic or            1.18   1.08    1.30     3.48   0.00
## Latino                                                                   
## covid:rural1                          1.06   0.98    1.15     1.40   0.16
## covid:year_center                     0.95   0.93    0.97    -4.89   0.00
## -------------------------------------------------------------------------
Compare AIC values to help with model selection
tmp <- data.frame(pois = AIC(pois), logbin = AIC(logbin), negb = AIC(negbin))
knitr::kable(tmp, align = 'l')
pois logbin negb
26549.99 26536.39 26363.42

Negative binomial lower than Poisson, but not by much - will check dispersion

library(AER)
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
dispersiontest(pois)
## 
##  Overdispersion test
## 
## data:  pois
## z = 6.3161, p-value = 1.341e-10
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##    1.03155

Check theta for negative binomial

summary(negbin)$theta
## [1] 13.94523

Likeliehood ratio test

library(lmtest)
lrtest(pois, negbin)
## Likelihood ratio test
## 
## Model 1: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid + 
##     male + age + state + year_center * covid
## Model 2: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid + 
##     age + male + state + year_center * covid + offset(log(n_rwdpts))
##   #Df LogLik Df  Chisq Pr(>Chisq)    
## 1  69 -13206                         
## 2  70 -13112  1 188.58  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Poisson isn’t too overdispersed, but negative binomial fits better. Negative binomial has a high theta - will use Quasi-Poisson as a middle ground.

3) Robust SEs

# Extract model coefficients
regression_results <- tidy(qpois)

# Compute robust standard errors
library(sandwich)
robust_se <- vcovHC(qpois, type = "HC0")

# Extract test statistics and coefficients with robust SEs
regression_results <- coeftest(qpois, vcov. = robust_se)

# combine into dataframe
robust_results <- data.frame(
  Term = rownames(regression_results),
  Estimate = regression_results[, 1],
  Robust_SE = regression_results[, 2],
  z_value = regression_results[, 3],
  p_values = regression_results[, 4]
)

# add 95% confidence intervals
z_value <- 1.96
robust_results$conf.low <- robust_results$Estimate - z_value * robust_results$Robust_SE
robust_results$conf.high <- robust_results$Estimate + z_value * robust_results$Robust_SE


print(robust_results)
##                                                                                    Term
## (Intercept)                                                                 (Intercept)
## race2American Indian or Alaska Native             race2American Indian or Alaska Native
## race2Asian/Pacific Islander                                 race2Asian/Pacific Islander
## race2Black or African American                           race2Black or African American
## race2Other                                                                   race2Other
## covid                                                                             covid
## ethnicityHispanic or Latino                                 ethnicityHispanic or Latino
## rural1                                                                           rural1
## male1                                                                             male1
## age35 to 64                                                                 age35 to 64
## age65 and Over                                                           age65 and Over
## stateAL                                                                         stateAL
## stateAR                                                                         stateAR
## stateAZ                                                                         stateAZ
## stateCA                                                                         stateCA
## stateCO                                                                         stateCO
## stateCT                                                                         stateCT
## stateDC                                                                         stateDC
## stateDE                                                                         stateDE
## stateFL                                                                         stateFL
## stateGA                                                                         stateGA
## stateHI                                                                         stateHI
## stateIA                                                                         stateIA
## stateID                                                                         stateID
## stateIL                                                                         stateIL
## stateIN                                                                         stateIN
## stateKS                                                                         stateKS
## stateKY                                                                         stateKY
## stateLA                                                                         stateLA
## stateMA                                                                         stateMA
## stateMD                                                                         stateMD
## stateME                                                                         stateME
## stateMI                                                                         stateMI
## stateMN                                                                         stateMN
## stateMO                                                                         stateMO
## stateMS                                                                         stateMS
## stateMT                                                                         stateMT
## stateNC                                                                         stateNC
## stateND                                                                         stateND
## stateNE                                                                         stateNE
## stateNH                                                                         stateNH
## stateNJ                                                                         stateNJ
## stateNM                                                                         stateNM
## stateNV                                                                         stateNV
## stateNY                                                                         stateNY
## stateOH                                                                         stateOH
## stateOK                                                                         stateOK
## stateOR                                                                         stateOR
## statePA                                                                         statePA
## stateRI                                                                         stateRI
## stateSC                                                                         stateSC
## stateSD                                                                         stateSD
## stateTN                                                                         stateTN
## stateTX                                                                         stateTX
## stateUT                                                                         stateUT
## stateVA                                                                         stateVA
## stateVT                                                                         stateVT
## stateWA                                                                         stateWA
## stateWI                                                                         stateWI
## stateWV                                                                         stateWV
## stateWY                                                                         stateWY
## year_center                                                                 year_center
## race2American Indian or Alaska Native:covid race2American Indian or Alaska Native:covid
## race2Asian/Pacific Islander:covid                     race2Asian/Pacific Islander:covid
## race2Black or African American:covid               race2Black or African American:covid
## race2Other:covid                                                       race2Other:covid
## covid:ethnicityHispanic or Latino                     covid:ethnicityHispanic or Latino
## covid:rural1                                                               covid:rural1
## covid:year_center                                                     covid:year_center
##                                                  Estimate   Robust_SE
## (Intercept)                                 -9.6743739582 0.470612554
## race2American Indian or Alaska Native       -0.2532650010 0.151273266
## race2Asian/Pacific Islander                 -0.0703069117 0.065412069
## race2Black or African American               0.0557086992 0.045369077
## race2Other                                   0.2194497636 0.047317444
## covid                                        0.0167282702 0.039765956
## ethnicityHispanic or Latino                 -0.1696088898 0.041652133
## rural1                                      -0.1561064829 0.035007879
## male1                                        0.3148675579 0.018243694
## age35 to 64                                  0.9393316989 0.032824259
## age65 and Over                               1.5998037177 0.032250863
## stateAL                                     -0.3929021527 0.479904883
## stateAR                                     -0.4254889516 0.540706994
## stateAZ                                      0.4894299148 0.469887065
## stateCA                                      0.3570819605 0.469537814
## stateCO                                     -0.8684757307 0.476527163
## stateCT                                     -0.4084976797 0.472351297
## stateDC                                     -0.5579168015 0.487412786
## stateDE                                      0.0097208047 0.472581687
## stateFL                                     -0.6828487635 0.473466526
## stateGA                                      0.1730360096 0.473665453
## stateHI                                      0.1402366510 0.480667044
## stateIA                                     -0.5488381444 0.473827132
## stateID                                     -0.7644695807 0.539450031
## stateIL                                     -0.8023579897 0.473894432
## stateIN                                     -0.2241826710 0.470124400
## stateKS                                     -0.1972103963 0.484760117
## stateKY                                     -0.1874848292 0.473811433
## stateLA                                     -0.6499887778 0.745213634
## stateMA                                     -0.3977089041 0.473102698
## stateMD                                     -0.6734540895 0.483482809
## stateME                                     -0.6310924906 0.472892500
## stateMI                                     -0.2070061209 0.513752910
## stateMN                                     -0.4356260918 0.491831289
## stateMO                                     -0.4693362447 0.471096403
## stateMS                                     -0.5244278805 0.490056983
## stateMT                                     -0.4214675740 0.473556054
## stateNC                                      0.5186884192 0.481841700
## stateND                                      0.3414511638 0.627327281
## stateNE                                     -0.5236689415 0.474960238
## stateNH                                     -0.3786500582 0.482400013
## stateNJ                                     -0.5457240709 0.474127909
## stateNM                                     -0.5104445482 0.480519807
## stateNV                                     -1.3739317385 0.550869149
## stateNY                                     -0.2548073744 0.470525780
## stateOH                                     -0.8847188783 0.475777942
## stateOK                                     -0.5702655433 0.639763489
## stateOR                                     -0.4465410131 0.475379668
## statePA                                     -0.4657807985 0.470602631
## stateRI                                      1.6029002877 0.569018770
## stateSC                                      0.0008069862 0.473786397
## stateSD                                     -0.0417943517 0.522056578
## stateTN                                     -0.5326122858 0.474391187
## stateTX                                     -0.6895774005 0.472222801
## stateUT                                     -2.5228856034 0.640088493
## stateVA                                     -0.2914993159 0.477969399
## stateVT                                     -0.8509490293 0.483173816
## stateWA                                     -1.2507620547 0.504706242
## stateWI                                     -0.4527100029 0.475831621
## stateWV                                     -0.4590467532 0.472110386
## stateWY                                     -0.7151720435 0.491380402
## year_center                                  0.0530192076 0.006823566
## race2American Indian or Alaska Native:covid  0.1413055255 0.192943375
## race2Asian/Pacific Islander:covid            0.2491547529 0.081789780
## race2Black or African American:covid         0.0436001682 0.067393929
## race2Other:covid                            -0.0007085809 0.067140078
## covid:ethnicityHispanic or Latino            0.1661129490 0.059889128
## covid:rural1                                 0.0580635678 0.046409306
## covid:year_center                           -0.0529290078 0.013641602
##                                                  z_value      p_values
## (Intercept)                                 -20.55698235  6.665151e-94
## race2American Indian or Alaska Native        -1.67422181  9.408701e-02
## race2Asian/Pacific Islander                  -1.07483087  2.824505e-01
## race2Black or African American                1.22790021  2.194844e-01
## race2Other                                    4.63781944  3.521042e-06
## covid                                         0.42066813  6.739974e-01
## ethnicityHispanic or Latino                  -4.07203366  4.660444e-05
## rural1                                       -4.45918139  8.227327e-06
## male1                                        17.25898066  9.578693e-67
## age35 to 64                                  28.61699649 4.129115e-180
## age65 and Over                               49.60498880  0.000000e+00
## stateAL                                      -0.81870839  4.129528e-01
## stateAR                                      -0.78691224  4.313332e-01
## stateAZ                                       1.04159053  2.976016e-01
## stateCA                                       0.76049670  4.469577e-01
## stateCO                                      -1.82251044  6.837757e-02
## stateCT                                      -0.86481752  3.871390e-01
## stateDC                                      -1.14464950  2.523544e-01
## stateDE                                       0.02056958  9.835890e-01
## stateFL                                      -1.44223240  1.492368e-01
## stateGA                                       0.36531271  7.148780e-01
## stateHI                                       0.29175425  7.704745e-01
## stateIA                                      -1.15830882  2.467380e-01
## stateID                                      -1.41712770  1.564456e-01
## stateIL                                      -1.69311546  9.043349e-02
## stateIN                                      -0.47685819  6.334631e-01
## stateKS                                      -0.40682059  6.841398e-01
## stateKY                                      -0.39569503  6.923300e-01
## stateLA                                      -0.87221804  3.830894e-01
## stateMA                                      -0.84063969  4.005498e-01
## stateMD                                      -1.39292251  1.636432e-01
## stateME                                      -1.33453690  1.820280e-01
## stateMI                                      -0.40292934  6.870002e-01
## stateMN                                      -0.88572261  3.757670e-01
## stateMO                                      -0.99626370  3.191220e-01
## stateMS                                      -1.07013653  2.845579e-01
## stateMT                                      -0.89000567  3.734628e-01
## stateNC                                       1.07647059  2.817168e-01
## stateND                                       0.54429510  5.862384e-01
## stateNE                                      -1.10255322  2.702212e-01
## stateNH                                      -0.78492962  4.324948e-01
## stateNJ                                      -1.15100601  2.497298e-01
## stateNM                                      -1.06227577  2.881105e-01
## stateNV                                      -2.49411633  1.262712e-02
## stateNY                                      -0.54153754  5.881371e-01
## stateOH                                      -1.85952059  6.295338e-02
## stateOK                                      -0.89136932  3.727311e-01
## stateOR                                      -0.93933553  3.475585e-01
## statePA                                      -0.98975392  3.222944e-01
## stateRI                                       2.81695503  4.848131e-03
## stateSC                                       0.00170327  9.986410e-01
## stateSD                                      -0.08005713  9.361918e-01
## stateTN                                      -1.12272804  2.615530e-01
## stateTX                                      -1.46027977  1.442132e-01
## stateUT                                      -3.94146377  8.098587e-05
## stateVA                                      -0.60987025  5.419478e-01
## stateVT                                      -1.76116544  7.821040e-02
## stateWA                                      -2.47819811  1.320478e-02
## stateWI                                      -0.95140798  3.413973e-01
## stateWV                                      -0.97232928  3.308868e-01
## stateWY                                      -1.45543461  1.455490e-01
## year_center                                   7.77001444  7.847718e-15
## race2American Indian or Alaska Native:covid   0.73236785  4.639441e-01
## race2Asian/Pacific Islander:covid             3.04628221  2.316903e-03
## race2Black or African American:covid          0.64694504  5.176675e-01
## race2Other:covid                             -0.01055377  9.915795e-01
## covid:ethnicityHispanic or Latino             2.77367455  5.542710e-03
## covid:rural1                                  1.25111908  2.108910e-01
## covid:year_center                            -3.87997020  1.044693e-04
##                                                 conf.low   conf.high
## (Intercept)                                 -10.59677456 -8.75197335
## race2American Indian or Alaska Native        -0.54976060  0.04323060
## race2Asian/Pacific Islander                  -0.19851457  0.05790074
## race2Black or African American               -0.03321469  0.14463209
## race2Other                                    0.12670757  0.31219195
## covid                                        -0.06121300  0.09466954
## ethnicityHispanic or Latino                  -0.25124707 -0.08797071
## rural1                                       -0.22472193 -0.08749104
## male1                                         0.27910992  0.35062520
## age35 to 64                                   0.87499615  1.00366725
## age65 and Over                                1.53659203  1.66301541
## stateAL                                      -1.33351572  0.54771142
## stateAR                                      -1.48527466  0.63429676
## stateAZ                                      -0.43154873  1.41040856
## stateCA                                      -0.56321216  1.27737608
## stateCO                                      -1.80246897  0.06551751
## stateCT                                      -1.33430622  0.51731086
## stateDC                                      -1.51324586  0.39741226
## stateDE                                      -0.91653930  0.93598091
## stateFL                                      -1.61084315  0.24514563
## stateGA                                      -0.75534828  1.10142030
## stateHI                                      -0.80187075  1.08234406
## stateIA                                      -1.47753932  0.37986303
## stateID                                      -1.82179164  0.29285248
## stateIL                                      -1.73119108  0.12647510
## stateIN                                      -1.14562650  0.69726115
## stateKS                                      -1.14734022  0.75291943
## stateKY                                      -1.11615524  0.74118558
## stateLA                                      -2.11060750  0.81062994
## stateMA                                      -1.32499019  0.52957238
## stateMD                                      -1.62108039  0.27417222
## stateME                                      -1.55796179  0.29577681
## stateMI                                      -1.21396182  0.79994958
## stateMN                                      -1.39961542  0.52836323
## stateMO                                      -1.39268520  0.45401271
## stateMS                                      -1.48493957  0.43608381
## stateMT                                      -1.34963744  0.50670229
## stateNC                                      -0.42572131  1.46309815
## stateND                                      -0.88811031  1.57101263
## stateNE                                      -1.45459101  0.40725312
## stateNH                                      -1.32415408  0.56685397
## stateNJ                                      -1.47501477  0.38356663
## stateNM                                      -1.45226337  0.43137427
## stateNV                                      -2.45363527 -0.29422821
## stateNY                                      -1.17703790  0.66742315
## stateOH                                      -1.81724364  0.04780589
## stateOK                                      -1.82420198  0.68367089
## stateOR                                      -1.37828516  0.48520314
## statePA                                      -1.38816196  0.45660036
## stateRI                                       0.48762350  2.71817708
## stateSC                                      -0.92781435  0.92942832
## stateSD                                      -1.06502525  0.98143654
## stateTN                                      -1.46241901  0.39719444
## stateTX                                      -1.61513409  0.23597929
## stateUT                                      -3.77745905 -1.26831216
## stateVA                                      -1.22831934  0.64532071
## stateVT                                      -1.79796971  0.09607165
## stateWA                                      -2.23998629 -0.26153782
## stateWI                                      -1.38533998  0.47991997
## stateWV                                      -1.38438311  0.46628960
## stateWY                                      -1.67827763  0.24793354
## year_center                                   0.03964502  0.06639340
## race2American Indian or Alaska Native:covid  -0.23686349  0.51947454
## race2Asian/Pacific Islander:covid             0.08884678  0.40946272
## race2Black or African American:covid         -0.08849193  0.17569227
## race2Other:covid                             -0.13230313  0.13088597
## covid:ethnicityHispanic or Latino             0.04873026  0.28349564
## covid:rural1                                 -0.03289867  0.14902581
## covid:year_center                            -0.07966655 -0.02619147

4) Post-COVID Slope & 95% CI

library(multcomp)
## Loading required package: mvtnorm
## 
## Attaching package: 'mvtnorm'
## The following object is masked from 'package:jtools':
## 
##     standardize
## Loading required package: TH.data
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
# set up named linear combination for the post-COVID slope
K_named <- setNames(rep(0, length(coef(qpois))), names(coef(qpois)))
K_named["year_center"] <- 1
K_named["covid:year_center"] <- 1  

post_slope_glht <- glht(qpois, linfct = rbind(K_named))

# summary and confidence interval
summary(post_slope_glht)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: glm(formula = n_fungal ~ race2 * covid + ethnicity * covid + 
##     rural * covid + male + age + state + year_center * covid, 
##     family = "quasipoisson", data = df1, offset = log(n_rwdpts))
## 
## Linear Hypotheses:
##               Estimate Std. Error z value Pr(>|z|)
## K_named == 0 0.0000902  0.0076414   0.012    0.991
## (Adjusted p values reported -- single-step method)
confint(post_slope_glht)
## 
##   Simultaneous Confidence Intervals
## 
## Fit: glm(formula = n_fungal ~ race2 * covid + ethnicity * covid + 
##     rural * covid + male + age + state + year_center * covid, 
##     family = "quasipoisson", data = df1, offset = log(n_rwdpts))
## 
## Quantile = 1.96
## 95% family-wise confidence level
##  
## 
## Linear Hypotheses:
##              Estimate   lwr        upr       
## K_named == 0  0.0000902 -0.0148867  0.0150671
# robust SE
vcov_robust <- vcovHC(qpois, type = "HC0")
glht(qpois, linfct = rbind(K_named), vcov = vcov_robust)
## 
##   General Linear Hypotheses
## 
## Linear Hypotheses:
##              Estimate
## K_named == 0 9.02e-05

5) Plot Regression Results

# RR instead of log, remove state and year variables
robust_results <- robust_results %>%
  mutate(RR = exp(Estimate),
         conf.low = exp(conf.low),
         conf.high = exp(conf.high)) %>%
  filter(str_detect(Term, "Intercept|race2|ethnicity|rural|male|age35|age65|covid|year"))

p_forest_regression <- ggplot(robust_results, aes(x = reorder(Term, RR), 
                                                  y = RR, ymin = conf.low, 
                                                  ymax = conf.high)) +
  geom_pointrange(size = 1.2, color = "black") +  # No p-value coloring
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +  
  coord_flip() +  
  theme_minimal() +
  labs(y = "Rate Ratio (RR)", x = "Predictors",
       title = "Estimated Rate Ratios with Robust Standard Errors",
       subtitle = "Confidence Intervals for Each Predictor") 

p_forest_regression

5) Interactions

Did the COVID pandemic deferentially affect aspergillosis diagnosis rates among certain demographic subgroups?

Contrasts

Estimate marginal means for interaction between race, ethnicity, rurality, and covid

library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
# get ratios for different groups
emm_options(rg.limit = 50000)
emm <- emmeans(qpois, ~ race2*ethnicity*rural*covid,
               at = list(year_center = 0),
               type = "response")

# compare Post- vs. Pre-COVID within each group
contrast_results <- contrast(emm, "revpairwise", by = c("race2", "ethnicity", "rural"), 
                             adjust = "sidak")

contrast_summary <- summary(contrast_results, infer = c(TRUE, TRUE), type = "response")

contrast_df <- as.data.frame(contrast_summary) %>%
  mutate(rural = ifelse(rural == "0", "urban", "rural"), # replace binary indicator with label
         group = paste(race2, ethnicity, rural, sep = " - "), # combine groups with separators
         p.value = signif(p.value, 3)) # format p-values to 3 digits

If RR > 1, aspergillosis rate increased post-COVID
If RR < 1, aspergillosis rate decreased post-COVID
~ 1, the rate stayed the same

Visualize results

p_forest <- ggplot(contrast_df, aes(x = group, y = ratio, ymin = asymp.LCL, 
                                    ymax = asymp.UCL, color = p.value < 0.05)) +
  geom_pointrange(size = 1.2) + # confidence intervals
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") + # reference line at no change (0)
  coord_flip() +
  theme_minimal() +
  labs(y = "Change in Rate Ratio (Post-COVID vs. Pre-COVID)", x = "Group", 
       title = "Post- vs. Pre-COVID Aspergillosis Rate Ratio Changes",
       subtitle = "Statistically significant comparisons (p < 0.05) are highlighted") +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"))

p_forest

Average for rural patients

contrast_df_na <- na.omit(contrast_df)

contrast_df_rural <- contrast_df_na %>%
  group_by(rural) %>%
  summarize(
    avg_ratio = mean(ratio),
    conf_low = mean(asymp.LCL),
    conf.high = mean(asymp.UCL)
  )

Boxplot - rural

p_box <- ggplot(contrast_df, aes(x = race2, y = ratio, fill = rural)) +
  geom_boxplot(width = 0.4, position = position_dodge(width = 0.5)) +
  theme_minimal() +
  labs(y = "aPR Change", x = "Racial Group",
       title = "Comparison of PR Changes Across Race & Rural Status",
       legend.title = "Urbanicity",
       fill = "Residence Type") +
  scale_fill_manual(values = c("rural" = "#D67236", 
                               "urban" = "#EAD3BF"))

p_box

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/rural_box.png", plot = p_box, width = 12, dpi = 300)

Average over race group to get ethnicity total

contrast_df_eth <- contrast_df %>%
  group_by(ethnicity) %>%
  summarize(
    avg_ratio = mean(ratio),
    conf_low = mean(asymp.LCL),
    conf.high = mean(asymp.UCL)
  )

Boxplot - race and ethnicity

p_box3 <- ggplot(contrast_df, aes(x = race2, y = ratio, fill = ethnicity)) +
  geom_boxplot(width = 0.4, position = position_dodge(width = 0.2)) +
  theme_minimal() +
  labs(y = "aPR Change", x = "Racial Group",
       title = "Comparison of PR Changes Across Race & Ethnicity Status",
       fill = "Ethnicity") +
  scale_fill_manual(values = c("Non-Hispanic" = "#E3B710", 
                               "Hispanic or Latino" = "#54A5B9"),
                    labels = c("Non-Hispanic" = "Non-Hispanic or Latino",
                               "Hispanic or Latino" = "Hispanic or Latino"))

p_box3

# ggsave("/Users/brittanymorgan/Desktop/Cerner Data/Aspergillosis_3_6_25/Plots/ethnicity_box.png", plot = p_box3, width = 12, dpi = 300)

Sort by RR change

contrast_df <- contrast_df %>%
  arrange(desc(ratio))

Test for trends across race groups, controlling for ethnicity

lm_model <- lm(ratio ~ race2 + ethnicity, data = contrast_df)
summary(lm_model)
## 
## Call:
## lm(formula = ratio ~ race2 + ethnicity, data = contrast_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.056712 -0.037101  0.000015  0.033633  0.063758 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            1.0382075  0.0249027  41.691 4.37e-16
## race2American Indian or Alaska Native  0.1733118  0.0321492   5.391 9.52e-05
## race2Asian/Pacific Islander            0.3230865  0.0321492  10.050 8.79e-08
## race2Black or African American         0.0508878  0.0321492   1.583    0.136
## race2Other                            -0.0008088  0.0321492  -0.025    0.980
## ethnicityHispanic or Latino            0.2073612  0.0203330  10.198 7.32e-08
##                                          
## (Intercept)                           ***
## race2American Indian or Alaska Native ***
## race2Asian/Pacific Islander           ***
## race2Black or African American           
## race2Other                               
## ethnicityHispanic or Latino           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04547 on 14 degrees of freedom
## Multiple R-squared:  0.9477, Adjusted R-squared:  0.929 
## F-statistic: 50.71 on 5 and 14 DF,  p-value: 1.783e-08

American Indian or Alaska Native and Asian/Pacific Islander had higher rate ratios (17% and 32%, respectively) than White individuals. Hispanic or Latino ethnicity had higher rate ratios (21%).

6) Tune Model

Check Fit

# check residual deviance
deviance(qpois) / df.residual(qpois)
## [1] 0.1846822

Likelihood ratio test

lrtest(qpois, negbin)
## Likelihood ratio test
## 
## Model 1: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid + 
##     male + age + state + year_center * covid
## Model 2: n_fungal ~ race2 * covid + ethnicity * covid + rural * covid + 
##     age + male + state + year_center * covid + offset(log(n_rwdpts))
##   #Df LogLik Df Chisq Pr(>Chisq)
## 1  70                           
## 2  70 -13112  0

Check VIF
VIF does not depend on response variable distribution, so can be computed from a Poisson model and applied to QPoisson

# create model matrix (excluding intercept)
X <- model.matrix(~ race2 * covid + ethnicity * covid + rural * covid + male + state, data = df1)[, -1]

# compute Variance Inflation Factor manually
vif_manual <- diag(solve(cor(X)))  # Inverse of correlation matrix

print(vif_manual)
##       race2American Indian or Alaska Native 
##                                    2.190987 
##                 race2Asian/Pacific Islander 
##                                    2.235495 
##              race2Black or African American 
##                                    2.268267 
##                                  race2Other 
##                                    2.291185 
##                                       covid 
##                                    4.940847 
##                 ethnicityHispanic or Latino 
##                                    1.690401 
##                                      rural1 
##                                    1.683494 
##                                       male1 
##                                    1.002470 
##                                     stateAL 
##                                    2.312752 
##                                     stateAR 
##                                    1.934501 
##                                     stateAZ 
##                                    2.860342 
##                                     stateCA 
##                                    3.059482 
##                                     stateCO 
##                                    2.465622 
##                                     stateCT 
##                                    2.564149 
##                                     stateDC 
##                                    1.886334 
##                                     stateDE 
##                                    2.320816 
##                                     stateFL 
##                                    2.607875 
##                                     stateGA 
##                                    2.316290 
##                                     stateHI 
##                                    2.096909 
##                                     stateIA 
##                                    2.528445 
##                                     stateID 
##                                    1.939010 
##                                     stateIL 
##                                    2.552520 
##                                     stateIN 
##                                    2.644641 
##                                     stateKS 
##                                    2.217563 
##                                     stateKY 
##                                    2.380409 
##                                     stateLA 
##                                    1.699608 
##                                     stateMA 
##                                    2.550047 
##                                     stateMD 
##                                    2.366866 
##                                     stateME 
##                                    2.477718 
##                                     stateMI 
##                                    1.971938 
##                                     stateMN 
##                                    2.096119 
##                                     stateMO 
##                                    2.730403 
##                                     stateMS 
##                                    2.148887 
##                                     stateMT 
##                                    2.350814 
##                                     stateNC 
##                                    2.199568 
##                                     stateND 
##                                    1.725493 
##                                     stateNE 
##                                    2.446277 
##                                     stateNH 
##                                    2.061078 
##                                     stateNJ 
##                                    2.598385 
##                                     stateNM 
##                                    2.605087 
##                                     stateNV 
##                                    2.206400 
##                                     stateNY 
##                                    2.641685 
##                                     stateOH 
##                                    2.526006 
##                                     stateOK 
##                                    1.921465 
##                                     stateOR 
##                                    2.450313 
##                                     statePA 
##                                    2.584870 
##                                     stateRI 
##                                    1.513530 
##                                     stateSC 
##                                    2.446383 
##                                     stateSD 
##                                    2.001837 
##                                     stateTN 
##                                    2.204163 
##                                     stateTX 
##                                    2.755818 
##                                     stateUT 
##                                    2.174149 
##                                     stateVA 
##                                    2.410477 
##                                     stateVT 
##                                    1.974936 
##                                     stateWA 
##                                    2.232801 
##                                     stateWI 
##                                    2.355739 
##                                     stateWV 
##                                    2.226638 
##                                     stateWY 
##                                    2.185249 
## race2American Indian or Alaska Native:covid 
##                                    2.444151 
##           race2Asian/Pacific Islander:covid 
##                                    2.537198 
##        race2Black or African American:covid 
##                                    2.613060 
##                            race2Other:covid 
##                                    2.688884 
##           covid:ethnicityHispanic or Latino 
##                                    2.381220 
##                                covid:rural1 
##                                    2.273979

Check residuals

# Extract residuals and fitted values
df_resid <- data.frame(Fitted = fitted(qpois), Residuals = residuals(qpois))

# Plot residuals
ggplot(df_resid, aes(x = Fitted, y = Residuals)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", color = "red") +
  theme_minimal() +
  labs(title = "Residuals vs Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! workspace required (7915126834) is too large probably because of setting 'se = TRUE'.